---
title: "Replication Code"
subtitle: "“Labeling Social Media Posts: Does Showing Coders Multimodal Content Produce Better Human Annotation, and a Better Machine Classifier?”"
output: 
  html_document:
    toc: true
date: "2025-02-18"
editor_options: 
  chunk_output_type: console
---

```{r setup, include=FALSE}
# Default for every later chunk: echo the chunk's source code in the output
knitr::opts_chunk$set(echo = TRUE)
```

## Load Required Packages

```{r, message=FALSE, warning=FALSE}
library(tidyverse)
theme_set(theme_minimal())
library(lubridate)
library(stringr)
library(irr)
library(xtable)
```

```{r}
# Print the R version, platform and attached/loaded package versions
# so the computing environment of this run is recorded in the output
sessionInfo()
```

## Load and Clean Data

```{r, message=FALSE}
# Load all labeled tweets
# Later chunks read the columns tweet_id, coder, week, treatment, n_emb,
# n_txt, label_timestamp and the indicator columns `1`:`77` from `d`
d <- readRDS("Data/labeled_tweets_all_clean.rds")

# Load codebook
# Later chunks read the columns codebook_id, category and label from it
d_codebook <- read_csv("Data/codebook_clean.csv")
```

#### Recode Labels

```{r}

# Collapse the 0/1 indicator columns (named by codebook id) into one
# categorical variable per sub-task. Within a sub-task the first indicator
# that equals 1 decides the category; rows matching none fall back to
# "Not Mentioned".
d_re <- d %>%
  transmute(
    # Identifier / design columns are carried over unchanged
    tweet_id,
    coder,
    week,
    treatment,
    n_emb,
    n_txt,

    Seriousness2 = case_when(
      `1` == 1 ~ "Serious",
      `2` == 1 ~ "Not Serious",
      .default = "Not Mentioned"
    ),

    # Binary sub-tasks
    `State of Economy` = ifelse(`6` == 1, "Mentioned", "Not Mentioned"),
    `Inequality` = ifelse(`7` == 1, "Mentioned", "Not Mentioned"),

    # Policy sub-tasks: Negative / Positive
    `Policy - Healthcare` = case_when(
      `9` == 1 ~ "Negative",
      `10` == 1 ~ "Positive",
      .default = "Not Mentioned"
    ),
    `Policy - Mask` = case_when(
      `11` == 1 ~ "Negative",
      `12` == 1 ~ "Positive",
      .default = "Not Mentioned"
    ),
    `Policy - Economic Relief` = case_when(
      `21` == 1 ~ "Negative",
      `22` == 1 ~ "Positive",
      .default = "Not Mentioned"
    ),

    # Government evaluation sub-tasks: Neutral / Negative / Positive
    `Government Evaluation - Federal` = case_when(
      `28` == 1 ~ "Neutral",
      `29` == 1 ~ "Negative",
      `30` == 1 ~ "Positive",
      .default = "Not Mentioned"
    ),
    `Government Evaluation - Trump` = case_when(
      `31` == 1 ~ "Neutral",
      `32` == 1 ~ "Negative",
      `33` == 1 ~ "Positive",
      .default = "Not Mentioned"
    ),
    `Government Evaluation - Governor` = case_when(
      `34` == 1 ~ "Neutral",
      `35` == 1 ~ "Negative",
      `36` == 1 ~ "Positive",
      .default = "Not Mentioned"
    ),

    # Any of the three indicators 74, 75, 77 marks the tweet as irrelevant
    `Irrelevant` = case_when(
      `74` == 1 | `75` == 1 | `77` == 1 ~ "Irrelevant",
      .default = "Not Mentioned"
    ),

    `Not enough information` = case_when(
      `76` == 1 ~ "Not enough information",
      .default = "Not Mentioned"
    )
  )


```


## Get Performance Metrics

### Additional Data Cleaning

```{r}
# Keep only tweets whose coders fully comply with the assignment, i.e. the
# number of coders is the same in both conditions and is either
# (a) 2 per condition or (b) 1 per condition
d <- d %>%
  filter(n_emb == n_txt, n_emb %in% c(1, 2))
```


### Initialize Table of Performance Metrics

```{r}
# Initiate a list to collect results
# Each metric chunk below stores a one-row summary (Metrics, mean, lo, hi)
# under its own name; the list is row-bound into `metrics_df` later
metrics_ls <- list()
```

### Get $\Delta T$

```{r}
# Calculate time differences between two consecutive labels of the same coder.
# `arrange()` ignores grouping, so rows end up sorted by timestamp overall;
# within each coder group they are therefore in chronological order and
# `lag()` returns that coder's previous label time.
d_time <- d %>%
  mutate(label_timestamp = as_datetime(label_timestamp)) %>%
  # 1 when the coder did NOT choose "Not enough information" (codebook id 76)
  mutate(valid_response = 1 - `76`) %>%
  select(tweet_id, coder, treatment, label_timestamp, valid_response) %>%
  arrange(coder) %>%
  group_by(coder) %>%
  arrange(label_timestamp) %>%
  # Request seconds explicitly: subtracting two date-times yields a difftime
  # whose unit is chosen automatically (secs/mins/hours/days), so a bare
  # as.numeric() is only in seconds by coincidence.
  mutate(
    time_use_sec = as.numeric(
      difftime(label_timestamp, lag(label_timestamp, 1), units = "secs")
    )
  )

# Get summary statistics of time_use_sec
quantile(d_time$time_use_sec, c(0.5, 0.75, 0.9, 0.95, 0.975, 0.99), na.rm = TRUE)

# Remove those that appear to be intervals between two work sessions
# Use 5 minutes as a threshold. That is 5 * 60 = 300
# (this also drops each coder's first label, whose time difference is NA)
d_time <- d_time %>%
  filter(time_use_sec <= 300) %>%
  mutate(treatment = ifelse(treatment == 1, "Multimodal", "Text-only"))


```


```{r, results='hide'}
# Bootstrap 95% confidence intervals for the treated and control group respectively
# Each draw resamples rows with replacement within coder (same number of rows
# per coder as in d_time) and records the mean labeling time per condition.
n_boot <- 1000
sum_time_boot <- list()
# Fixed seed so the bootstrap draws are reproducible
set.seed(123)
pb <- txtProgressBar(min = 0, max = n_boot, style = 3)
for (i in 1:n_boot){
  # Resample within coder, then drop the grouping before summarising
  d_boot <- d_time %>% 
    group_by(coder) %>%
    sample_frac(1, replace = TRUE) %>%
    ungroup()
  
  # One row per condition: mean seconds per label in this draw
  sum_time_boot[[i]] <- d_boot %>%
    group_by(treatment) %>%
    summarise(time_use_sec = mean(time_use_sec, na.rm = TRUE)) %>%
    mutate(boot = i, .before = 1)
  setTxtProgressBar(pb, i)
}


```

```{r}
# One row per bootstrap draw with one column per condition, plus the
# relative difference in mean labeling time (Multimodal vs. Text-only)
sum_time_boot_df <- sum_time_boot %>%
  bind_rows() %>%
  pivot_wider(names_from = treatment, values_from = time_use_sec) %>%
  mutate(DeltaT = (Multimodal - `Text-only`) / `Text-only`)

# Point estimate: mean over draws; interval: 2.5th and 97.5th percentiles
metrics_ls[["DeltaT"]] <- summarise(
  sum_time_boot_df,
  mean = mean(DeltaT),
  lo = quantile(DeltaT, 0.025),
  hi = quantile(DeltaT, 0.975)
) %>%
  mutate(Metrics = "DeltaT", .before = 1)

print(metrics_ls[["DeltaT"]])
```

### Get $\Delta T_v$

```{r,results='hide'}
## Bootstrap the confidence interval of Delta T_v
## Same within-coder resampling as for Delta T, but the per-condition
## statistic is total labeling time divided by the number of valid responses
## (labels other than "Not enough information").

n_boot <- 1000
sum_time_boot <- list()
# Fixed seed so the bootstrap draws are reproducible
set.seed(123)
pb <- txtProgressBar(min = 0, max = n_boot, style = 3)
i <- 1
for (i in 1:n_boot){
  # Resample rows with replacement within coder
  d_boot <- d_time %>% group_by(coder) %>% sample_frac(1, replace = TRUE)
  # group_by(treatment) replaces the coder grouping before summarising
  sum_time_boot[[i]] <- d_boot %>%
    group_by(treatment) %>%
    summarise(
      time_use_sec = sum(time_use_sec, na.rm = TRUE) / sum(valid_response, na.rm = TRUE)
      ) %>%
    mutate(boot = i, .before = 1)
  setTxtProgressBar(pb, i)
}

# One row per draw; the column is called DeltaT here but holds Delta T_v
sum_time_boot_df <- bind_rows(sum_time_boot) %>%
  pivot_wider(names_from = "treatment", values_from = "time_use_sec") %>%
  mutate(DeltaT = (Multimodal - `Text-only`) / `Text-only`)

```

```{r}
# Summarise the bootstrap distribution of Delta T_v (held in the `DeltaT`
# column of sum_time_boot_df): mean plus 2.5th / 97.5th percentiles
metrics_ls[["DeltaTv"]] <- summarise(
  sum_time_boot_df,
  mean = mean(DeltaT),
  lo = quantile(DeltaT, 0.025),
  hi = quantile(DeltaT, 0.975)
) %>%
  mutate(Metrics = "DeltaTv", .before = 1)

metrics_ls[["DeltaTv"]]
```

### Get $\Delta R$

```{r,results='hide'}
# One row per (tweet, coder) with an indicator of a valid response,
# i.e. the coder did not pick "Not enough information" (codebook id 76)
d_validresponse <- d %>%
  mutate(treatment = ifelse(treatment == 1, "Multimodal", "Text-only")) %>%
  # group_by(treatment) %>%
  mutate(valid_response = 1 - `76`) %>%
  select(tweet_id, coder, treatment, valid_response) %>%
  arrange(tweet_id, coder, treatment)
  
# Bootstrap 95% confidence intervals for the treated and control group respectively
# Each draw resamples rows with replacement within coder and records the
# share of valid responses per condition.
n_boot <- 1000
sum_validresponse_boot <- list()
# Fixed seed so the bootstrap draws are reproducible
set.seed(123)
pb <- txtProgressBar(min = 0, max = n_boot, style = 3)
for (i in 1:n_boot){
  d_boot <- d_validresponse %>% group_by(coder) %>% sample_frac(1, replace = TRUE)
  # group_by(treatment) replaces the coder grouping before summarising
  sum_validresponse_boot[[i]] <- d_boot %>%
    group_by(treatment) %>%
    summarise(valid_response = mean(valid_response, na.rm = TRUE)) %>%
    mutate(boot = i, .before = 1)
  setTxtProgressBar(pb, i)
}

# One row per draw: relative difference in the valid-response rate
sum_validresponse_boot_df <- bind_rows(sum_validresponse_boot) %>%
  pivot_wider(names_from = "treatment", values_from = "valid_response") %>%
  mutate(DeltaValidResponse = (Multimodal - `Text-only`) / `Text-only`)

```

```{r}
# Summarise the bootstrap distribution of the relative change in the
# valid-response rate: mean plus 2.5th / 97.5th percentiles
metrics_ls[["DeltaValidResponse"]] <- summarise(
  sum_validresponse_boot_df,
  mean = mean(DeltaValidResponse),
  lo = quantile(DeltaValidResponse, 0.025),
  hi = quantile(DeltaValidResponse, 0.975)
) %>%
  mutate(Metrics = "DeltaValidResponse", .before = 1)

metrics_ls[["DeltaValidResponse"]]
```



### Get $\Delta I$

```{r}

run_bootstrap <- FALSE
# Note: Bootstrap results have been run and saved. Change to TRUE to run the bootstrapping
# However, it will be a long process if you do so.


# Number of bootstrap replications
N_BOOTSTRAP = 1000

if (run_bootstrap){
  # Recode the indicator columns into one categorical variable per sub-task,
  # restricted to tweets that have two coders in each condition
  d_irr <- d %>%  
    filter((n_emb == 2 & n_txt == 2)) %>%
    transmute(
      tweet_id = tweet_id, 
      coder = coder,
      treatment = treatment,
      
      Seriousness2 = case_when(
        `1` == 1 ~ "Serious",
        `2` == 1 ~ "Not Serious",
        TRUE ~ "Not Mentioned"
      ),

      `State of Economy` = ifelse(`6` == 1, "Mentioned", "Not Mentioned"),
      `Inequality` = ifelse(`7` == 1, "Mentioned", "Not Mentioned"),
      `Policy - Healthcare` = case_when(
        `9` == 1 ~ "Negative",
        `10` == 1 ~ "Positive",
        TRUE ~ "Not Mentioned"
      ),
      `Policy - Mask` = case_when(
        `11` == 1 ~ "Negative",
        `12` == 1 ~ "Positive",
        TRUE ~ "Not Mentioned"
      ),
      `Policy - Economic Relief` = case_when(
        `21` == 1 ~ "Negative",
        `22` == 1 ~ "Positive",
        TRUE ~ "Not Mentioned"
      ),
      `Government Evaluation - Federal` = case_when(
        `28` == 1 ~ "Neutral",
        `29` == 1 ~ "Negative",
        `30` == 1 ~ "Positive",
        TRUE ~ "Not Mentioned"
      ),
      `Government Evaluation - Trump` = case_when(
        `31` == 1 ~ "Neutral",
        `32` == 1 ~ "Negative",
        `33` == 1 ~ "Positive",
        TRUE ~ "Not Mentioned"
      ),
      `Government Evaluation - Governor` = case_when(
        `34` == 1 ~ "Neutral",
        `35` == 1 ~ "Negative",
        `36` == 1 ~ "Positive",
        TRUE ~ "Not Mentioned"
      ),
      
      `Irrelevant` = case_when(
        `74` == 1 | `75` == 1 | `77` == 1 ~ "Irrelevant",
        TRUE ~ "Not Mentioned"
      ),
      
      `Not enough information` = case_when(
        `76` == 1 ~ "Not enough information",
        TRUE ~ "Not Mentioned"
      )
    )
  
  
  # Replace coder identity by an anonymous slot within (tweet, condition):
  # coding_id is "Embed_1", "Embed_2", "Text_1", ... numbered by row order
  d_irr_t <- d_irr %>%
    arrange(tweet_id, treatment) %>%
    group_by(tweet_id, treatment) %>%
    mutate(id_within_treatment = row_number(), .after = treatment) %>%
    ungroup() %>%
    mutate(treatment = ifelse(treatment == 1, "Embed", "Text")) %>%
    mutate(coding_id = str_c(treatment, "_", id_within_treatment), .after = id_within_treatment) %>%
    select(-treatment, -id_within_treatment, -coder)
  
  # Every unordered pair of coding slots (one pair per column)
  groups <- unique(d_irr_t$coding_id)
  groups_comb <- combn(groups, 2, simplify = TRUE)
  
  tweet_id_unique <- unique(d_irr_t$tweet_id)
  # Sub-task columns = everything after tweet_id, coder, treatment
  variables <- names(d_irr)[-(1:3)]
  
  results <- list()
  
  # l indexes the next free position in `results`
  l <- 1
  i <- 1
  # Fixed seed so the bootstrap draws are reproducible
  set.seed(23)
  pb <- txtProgressBar(min = 0, max = N_BOOTSTRAP, style = 3)
  for (i in 1:N_BOOTSTRAP){
    # Resample tweets (not individual labels) with replacement
    tweet_id_boot <- sample(tweet_id_unique, length(tweet_id_unique), replace = TRUE)
    
    # uid keeps repeated draws of the same tweet apart; the join is
    # many-to-many because a tweet can be drawn several times and has
    # several label rows
    d_irr_t_bt <- tibble(tweet_id = tweet_id_boot) %>%
      mutate(uid = row_number(), .before = 1) %>%
      left_join(d_irr_t, by = "tweet_id", relationship = "many-to-many") %>%
      select(-tweet_id)
    
    j <- 1
    for (j in seq_along(variables)){
      variable <- variables[j]
      
      # Wide matrix for this sub-task: one row per drawn tweet,
      # one column per coding slot
      mat <- d_irr_t_bt %>%
        select(uid, coding_id, !!(variable)) %>%
        rename("variable" = !!variable) %>%
        pivot_wider(id_cols = "uid", names_from = "coding_id", values_from = "variable") %>%
        select(-uid)
      
      k <- 1
      for (k in 1:ncol(groups_comb)){
        # Fleiss' kappa for the two slots of this pair
        mat_s <- mat[, groups_comb[,k]]
        fleiss_k <- irr::kappam.fleiss(mat_s, detail = TRUE)
        fk <- fleiss_k$value
        
        results[[l]] <- tibble(
          bootstrap_id = i,
          `Sub-task` = variables[j],
          Coder1 = groups_comb[1,k],
          Coder2 = groups_comb[2,k],
          fk = fk
        )
        l <- l + 1
      }
    }
    setTxtProgressBar(pb, i)
  }
  # One row per (bootstrap draw, sub-task, slot pair); cached for the
  # analysis chunk that follows
  results_df <- bind_rows(results)
  write_rds(results_df, "Data/processed/coder_results.rds")
}

```

```{r}
## Analyze results ------

# Cached bootstrap output: one row per (bootstrap draw, sub-task, pair of
# coding slots) with its Fleiss' kappa in `fk`
sum_irr_raw <- read_rds("Data/processed/coder_results.rds")

# Recode "Seriousness2" to "Seriousness"

sum_irr_raw <- sum_irr_raw %>%
  mutate(`Sub-task` = recode(`Sub-task`, "Seriousness2" = "Seriousness"))

# Sub-tasks that enter Delta I. The names must match the recoded column
# names exactly: the earlier spelling "Policy- Healthcare" (missing space)
# matched nothing, so that sub-task was silently dropped by the filter below.
# NOTE(review): restoring it changes the Delta I estimate relative to runs
# made with the misspelled name.
variables_select <- c("Seriousness", 
                      "State of Economy", "Inequality", 
                      "Policy - Healthcare", "Policy - Mask", "Policy - Economic Relief",
                      "Government Evaluation - Federal", "Government Evaluation - Trump",
                      "Government Evaluation - Governor")

# Strip the slot number ("Embed_1" -> "Embed"), discard Text-Text pairs and
# label the remaining pairs by the conditions they compare
sum_irr <- sum_irr_raw %>%
  filter(`Sub-task` %in% variables_select) %>%
  mutate(Coder1 = str_remove(Coder1, "_\\d+"),
         Coder2 = str_remove(Coder2, "_\\d+")) %>%
  filter(!(Coder1 == "Text" & Coder2 == "Text")) %>%
  mutate(Group = case_when(
    Coder1 == "Embed" & Coder2 == "Embed" ~ "I(Multimodal, Multimodal)",
    Coder1 != Coder2 ~ "I(Multimodal, Text-only)",
    TRUE ~ NA_character_), .after = Coder2
  )

# Per bootstrap draw: average kappa within each pair type, then the relative
# difference between multimodal-multimodal and multimodal-text agreement.
# `.groups = "drop_last"` is what summarise() did implicitly before.
sum_irr_wide <- sum_irr %>%
  group_by(bootstrap_id, Group) %>%
  summarise(fk = mean(fk), .groups = "drop_last") %>%
  pivot_wider(names_from = "Group", values_from = "fk") %>%
  mutate(DeltaKappa = (`I(Multimodal, Multimodal)` - `I(Multimodal, Text-only)`) / `I(Multimodal, Text-only)`) 

# Save the results
metrics_ls[["DeltaKappa"]] <- sum_irr_wide %>%
  ungroup() %>%
  summarise(
    mean = mean(DeltaKappa),
    lo = quantile(DeltaKappa, 0.025),
    hi = quantile(DeltaKappa, 0.975)
  ) %>%
  mutate(Metrics = "DeltaKappa", .before = 1)

metrics_ls[["DeltaKappa"]]
```

### Get $\Delta P$

```{r, message=FALSE}
PATH_MODEL_OUTPUT <- "Data/processed/classifier_results"

model_names <- list.files(PATH_MODEL_OUTPUT)

# Read every result file; `model_name` is split on spaces into
# model type, label subset, labeling method and random seed
sum_classifier_ls <- list()

for (i in seq_along(model_names)){
  tmp_d <- file.path(PATH_MODEL_OUTPUT, model_names[i]) %>%
    read_csv() %>%
    separate(
      model_name,
      c("model_type", "subset", "method", "random_seed"),
      sep = " ",
      remove = FALSE
    )

  sum_classifier_ls[[i]] <- tmp_d
}
rm(tmp_d)

vars <- c("model_name", "model_type", "subset", "method", "random_seed", "global_step")

# Stack all files, keep the first row per (model, step) combination and
# give method / subset their display values
sum_classifier_df <- bind_rows(sum_classifier_ls) %>%
  select(!!!syms(vars), everything()) %>%
  group_by(!!!syms(vars)) %>%
  filter(row_number() == 1) %>%
  mutate(
    method = recode(method, "emb" = "Embedded", "txt" = "Text-Only"),
    subset = gsub("^Sub_", " ", subset)
  )

# For every model keep the row with the highest macro F1
sum_classifier_df_s <- sum_classifier_df %>%
  group_by(model_name) %>%
  arrange(desc(f1_macro)) %>%
  filter(row_number() == 1) %>%
  ungroup()

# Multi-label models only: pair the two labeling methods per (subset, seed)
# and take the relative difference in macro F1
sum_classifier_df_multilabel <- sum_classifier_df_s %>%
  filter(model_type == "Multi-label") %>%
  select(subset, random_seed, method, f1_macro) %>%
  pivot_wider(names_from = method, values_from = f1_macro) %>%
  mutate(DeltaP = (Embedded - `Text-Only`) / `Text-Only`)

# Save the results
metrics_ls[["DeltaP"]] <- summarise(
  drop_na(sum_classifier_df_multilabel),
  mean = mean(DeltaP),
  lo = quantile(DeltaP, 0.025),
  hi = quantile(DeltaP, 0.975)
) %>%
  mutate(Metrics = "DeltaP", .before = 1)

metrics_ls[["DeltaP"]]
```


## Main Results

```{r}
# Stack the per-metric summaries; reversing the factor levels makes the
# first metric appear at the top once the plot axes are flipped
metrics_df <- bind_rows(metrics_ls) %>%
  mutate(Metrics = factor(Metrics, levels = rev(Metrics)))
```

### REPLICATE FIGURE 2

```{r, fig.width=6, fig.height=3}
# Point estimate and interval for each metric; the dashed line marks
# "no change". Axis tick labels are rendered as Delta symbols.
ggplot(metrics_df, aes(x = Metrics, y = mean, ymin = lo, ymax = hi)) +
  geom_pointrange() +
  geom_hline(yintercept = 0, linetype = "dashed") +
  coord_flip() +
  scale_x_discrete(
    labels = c(
      "DeltaT" = expression(Delta * "T"),
      "DeltaTv" = expression(Delta * "T" * "v"),
      "DeltaValidResponse" = expression(Delta * "R"),
      "DeltaKappa" = expression(Delta * "I"),
      "DeltaP" = expression(Delta * "P")
    )
  ) +
  scale_y_continuous(labels = scales::percent_format()) +
  labs(x = "", y = "Percentage Change due to Multimodal Labeling (95% CI)")

# Saves the most recently created plot
ggsave("Figures_Tables/FIGURE_2.pdf", width = 6, height = 3)
```

## Additional Results in Appendices

### REPLICATE TABLE A-1

```{r}
# Build a tweet-by-coder 0/1 assignment matrix and multiply its transpose by
# itself: cell (a, b) counts the tweets labeled by both coder a and coder b
# (the diagonal is each coder's own total)
coders <- unique(d$coder)

colabel <- d %>%
  select(tweet_id, coder, week) %>%
  mutate(dummy = 1) %>%
  pivot_wider(names_from = coder, values_from = dummy, values_fill = 0)

colabel_mat <- as.matrix(select(colabel, all_of(coders)))

colabel_out <- t(colabel_mat) %*% colabel_mat

# LaTeX version written to file
print(
  xtable(colabel_out, digits = 0,
         caption = "Assignment Overlap between Coders"),
  size = "\\footnotesize",
  file = "Figures_Tables/Table_A-1.tex"
)

# Version shown in the knitted document
knitr::kable(colabel_out)
```


### REPLICATE TABLE A-2

```{r}
# Display the metric summaries. `lo` and `hi` are the 0.025 and 0.975
# quantiles of each metric's distribution (a 95% interval), so the headers
# name the 2.5th / 97.5th percentiles rather than the 5th / 95th.
metrics_df %>%
  select(Metrics, mean, lo, hi) %>%
  set_names(c("Metrics", "Mean", "2.5th Percentile", "97.5th Percentile")) %>%
  knitr::kable(digits = 2)
```

```{r, results='hide'}
# To LaTeX table
# `tabular.environment` names the environment that receives the column
# specification, so it must be a tabular-like environment; "table" would
# emit `\begin{table}{...}`, which is not valid LaTeX.
metrics_df %>%
  select(Metrics, mean, lo, hi) %>%
  xtable(caption = "Performance metrics") %>%
  print(tabular.environment = "tabular", floating = FALSE, 
        include.rownames = FALSE)
```

### REPLICATE TABLE A-3

```{r}
# "<category> - <label>" for every codebook entry
label_names <- paste(d_codebook$category, d_codebook$label, sep = " - ")

# Count how often each codebook label was assigned in each condition:
# one row per codebook id, one column per treatment value
sum_num_labels <- d %>%
  select(treatment, `1`:`77`) %>%
  pivot_longer(cols = -treatment, names_to = "label", values_to = "value") %>%
  group_by(treatment, label) %>%
  summarise(n = sum(value)) %>%
  pivot_wider(names_from = treatment, values_from = n) %>%
  mutate(label = as.integer(label)) %>%
  arrange(label) %>%
  rename(codebook_id = label, `Text-Only` = `0`, `Multimodal` = `1`)

# Attach readable label names from the codebook and leave out the labels
# whose name contains "Irrelevant"
sum_num_labels <- d_codebook %>%
  select(codebook_id, category, label) %>%
  unite("Label", category, label, sep = " - ") %>%
  left_join(sum_num_labels, by = "codebook_id") %>%
  select(-codebook_id) %>%
  filter(!str_detect(Label, "Irrelevant"))

```

```{r}
# Write the table as a LaTeX longtable without row names
print(
  xtable(sum_num_labels, caption = "Number of labels by treatment and label"),
  tabular.environment = "longtable", floating = FALSE,
  include.rownames = FALSE, file = "Figures_Tables/Table_A-3.tex"
)

# Version shown in the knitted document
knitr::kable(sum_num_labels)
```

### REPLICATE TABLE A-4

```{r}
# Tweets marked "not enough information" (NEI, codebook id 76) by at least
# one text-only coder; `n` is the number of text-only coders who did so
d_text_nei <- d %>%
  filter(treatment == 0, `76` == 1) %>%
  group_by(tweet_id) %>%
  count()

# Of those tweets, the ones given a substantive (non-NEI) label by at least
# one multimodal coder; `n` is the number of multimodal coders who did so
d_text_nei_multimodal_1 <- d %>%
  filter(
    treatment == 1,
    tweet_id %in% d_text_nei$tweet_id,
    `76` != 1
  ) %>%
  group_by(tweet_id) %>%
  count()

# Tweets marked NEI by both text-only coders
d_text_nei_2 <- filter(d_text_nei, n == 2)

# Of those tweets, the ones given a substantive label by at least one
# multimodal coder
d_text_nei_multimodal_2 <- d %>%
  filter(
    treatment == 1,
    tweet_id %in% d_text_nei_2$tweet_id,
    `76` != 1
  ) %>%
  group_by(tweet_id) %>%
  count()

# Rows: how many text-only coders said NEI.
# Columns: share of those tweets that received a substantive label from
# at least one / from both multimodal coders.
tmp_output <- tribble(
  ~`NEI in text-only`, ~`Valid label in multimodal - At least one coder`, ~`Valid label in multimodal - Both coders`,

  # NEI by AT LEAST ONE text-only coder
  "At least one coder",
  # ... substantive label from AT LEAST ONE multimodal coder
  nrow(d_text_nei_multimodal_1) / nrow(d_text_nei),
  # ... substantive label from BOTH multimodal coders
  nrow(filter(d_text_nei_multimodal_1, n == 2)) / nrow(d_text_nei),

  # NEI by BOTH text-only coders
  "Both coders",
  # ... substantive label from AT LEAST ONE multimodal coder
  nrow(d_text_nei_multimodal_2) / nrow(d_text_nei_2),
  # ... substantive label from BOTH multimodal coders
  nrow(filter(d_text_nei_multimodal_2, n == 2)) / nrow(d_text_nei_2)
)

knitr::kable(tmp_output, digits = 4)

rm(tmp_output)
```


### REPLICATE FIGURE A-1

```{r}
# Create indicator of group assignment: coders in order of first appearance,
# split into three groups of four (requires exactly 12 distinct coders)
coders_group <- d %>%
  distinct(coder) %>%
  mutate(group = rep(1:3, each = 4))

# Number of labels per coder and week, printed as text and coloured by
# the labeling condition
p <- d %>%
  select(coder, week, treatment) %>%
  mutate(
    coder = factor(coder, rev(sort(coders_group$coder))),
    treatment = ifelse(treatment == 1, "Multimodal Labeling", "Text-only Labeling"),
    week = factor(week, levels = 1:5)
  ) %>%
  group_by(coder, week, treatment) %>%
  summarise(N = n()) %>%
  ggplot(aes(x = week, y = coder)) +
  geom_text(aes(color = treatment, label = N)) +
  theme(legend.position = "bottom") +
  labs(x = "Week", y = "Coder", color = "Treatment Group")

p

ggsave("Figures_Tables/Figure_A-1.pdf", plot = p, width = 6, height = 4)
```

### REPLICATE FIGURE A-2

```{r}

# Average time used to label a post, per condition, plus the annotation
# text and its (hand-picked) vertical position in the plot
d_time_sum <- d_time %>%
  group_by(treatment) %>%
  summarise(time_use_sec = mean(time_use_sec, na.rm = TRUE)) %>%
  mutate(
    label = paste0(treatment, " labeling on average took ", round(time_use_sec), "s"),
    y = c(0.06, 0.04)
  )

# Density of labeling time by condition; dashed lines and text mark the means
p <- ggplot(d_time) +
  geom_density(aes(x = time_use_sec, fill = treatment), alpha = 0.4) +
  xlab("Time used to code a tweet (seconds)") +
  geom_vline(
    aes(xintercept = time_use_sec, color = treatment),
    linetype = "dashed",
    data = d_time_sum
  ) +
  geom_text(
    aes(x = time_use_sec, y = y, label = label),
    data = d_time_sum,
    hjust = 0
  ) +
  theme(legend.position = "bottom", legend.title = element_blank())

p

ggsave("Figures_Tables/Figure_A-2.pdf", plot = p, width = 5, height = 4)

```

### REPLICATE FIGURE A-3

```{r}
# Bootstrap distribution of the average Fleiss' kappa, one panel per
# sub-task, comparing the two pair types
sum_irr %>%
  group_by(bootstrap_id, Group, `Sub-task`) %>%
  summarise(fk = mean(fk)) %>%
  mutate(`Sub-task` = factor(`Sub-task`, levels = variables_select)) %>%
  ggplot(aes(x = fk, fill = Group)) +
  geom_density(alpha = 0.4) +
  facet_wrap(~`Sub-task`, scales = "free_y") +
  labs(x = "Fleiss' Kappa", fill = "Labeling Methods") +
  theme(legend.position = "top")

# Saves the most recently created plot
ggplot2::ggsave("Figures_Tables/Figure_A-3.pdf", width = 8, height = 6)


```

### REPLICATE FIGURE A-4

```{r}
# Plot the differences: By sub-task
# Per bootstrap draw and sub-task: average kappa for each pair type and the
# relative difference between the two
sum_irr_wide_subtask <- sum_irr %>%
  group_by(bootstrap_id, Group, `Sub-task`) %>%
  summarise(fk = mean(fk)) %>%
  pivot_wider(names_from = Group, values_from = fk) %>%
  mutate(
    DeltaKappa = (`I(Multimodal, Multimodal)` - `I(Multimodal, Text-only)`) /
      `I(Multimodal, Text-only)`
  )

# Mean and 2.5th / 97.5th percentiles over draws, per sub-task
sum_irr_wide_subtask_sum <- sum_irr_wide_subtask %>%
  group_by(`Sub-task`) %>%
  summarise(
    mean = mean(DeltaKappa),
    lo = quantile(DeltaKappa, 0.025),
    hi = quantile(DeltaKappa, 0.975)
  ) %>%
  mutate(Metrics = "DeltaKappa", .before = 1)

# Levels are reversed so the first sub-task is on top after coord_flip()
sum_irr_wide_subtask_sum %>%
  mutate(`Sub-task` = factor(`Sub-task`, levels = rev(variables_select))) %>%
  ggplot(aes(x = `Sub-task`, y = mean, ymin = lo, ymax = hi)) +
  geom_pointrange() +
  geom_hline(yintercept = 0, linetype = "dashed") +
  coord_flip() +
  scale_y_continuous(labels = scales::percent_format()) +
  # Delta I
  labs(x = "Labels", y = expression(Delta * "I"))

# Saves the most recently created plot
ggsave("Figures_Tables/Figure_A-4.pdf", width = 6, height = 4)

```

### REPLICATE FIGURE A-5

Check whether coders' performance changes over time.

```{r}

run_bootstrap <- FALSE 
# Note: Bootstrap results have been run and saved. Change to TRUE to run the bootstrapping
# However, it will be a long process if you do so.

# Number of bootstrap replications per week
N_BOOTSTRAP <- 1000

if (run_bootstrap){
  for (week_current in 1:5){
    message("Week: ", week_current)
    # Check the number of labels per column
    # (diagnostic only; not used by the computation below)
    sum_label <- d %>%
      filter(week == !!(week_current)) %>%
      select(`1`:`77`) %>%
      pivot_longer(cols = everything(), names_to = "label", values_to = "value") %>%
      group_by(label) %>%
      summarise(n = sum(value)) %>%
      mutate(label = as.integer(label)) %>%
      arrange(label)
    
    # Recode the indicator columns into one categorical variable per
    # sub-task, for this week's tweets with two coders in each condition
    d_irr <- d %>%  
      filter(week == !!(week_current)) %>%
      filter((n_emb == 2 & n_txt == 2)) %>%
      transmute(
        tweet_id = tweet_id, 
        coder = coder,
        treatment = treatment,
        
        Seriousness2 = case_when(
          `1` == 1 ~ "Serious",
          `2` == 1 ~ "Not Serious",
          TRUE ~ "Not Mentioned"
        ),
  
        `State of Economy` = ifelse(`6` == 1, "Mentioned", "Not Mentioned"),
        `Inequality` = ifelse(`7` == 1, "Mentioned", "Not Mentioned"),
        `Policy - Healthcare` = case_when(
          `9` == 1 ~ "Negative",
          `10` == 1 ~ "Positive",
          TRUE ~ "Not Mentioned"
        ),
        `Policy - Mask` = case_when(
          `11` == 1 ~ "Negative",
          `12` == 1 ~ "Positive",
          TRUE ~ "Not Mentioned"
        ),
        `Policy - Economic Relief` = case_when(
          `21` == 1 ~ "Negative",
          `22` == 1 ~ "Positive",
          TRUE ~ "Not Mentioned"
        ),
        `Government Evaluation - Federal` = case_when(
          `28` == 1 ~ "Neutral",
          `29` == 1 ~ "Negative",
          `30` == 1 ~ "Positive",
          TRUE ~ "Not Mentioned"
        ),
        `Government Evaluation - Trump` = case_when(
          `31` == 1 ~ "Neutral",
          `32` == 1 ~ "Negative",
          `33` == 1 ~ "Positive",
          TRUE ~ "Not Mentioned"
        ),
        `Government Evaluation - Governor` = case_when(
          `34` == 1 ~ "Neutral",
          `35` == 1 ~ "Negative",
          `36` == 1 ~ "Positive",
          TRUE ~ "Not Mentioned"
        ),
        
        `Irrelevant` = case_when(
          `74` == 1 | `75` == 1 | `77` == 1 ~ "Irrelevant",
          TRUE ~ "Not Mentioned"
        ),
        
        `Not enough information` = case_when(
          `76` == 1 ~ "Not enough information",
          TRUE ~ "Not Mentioned"
        )
      )
    
    # Replace coder identity by an anonymous slot within (tweet, condition):
    # coding_id is "Embed_1", "Embed_2", "Text_1", ... numbered by row order
    d_irr_t <- d_irr %>%
      arrange(tweet_id, treatment) %>%
      group_by(tweet_id, treatment) %>%
      mutate(id_within_treatment = row_number(), .after = treatment) %>%
      ungroup() %>%
      mutate(treatment = ifelse(treatment == 1, "Embed", "Text")) %>%
      mutate(coding_id = str_c(treatment, "_", id_within_treatment), .after = id_within_treatment) %>%
      select(-treatment, -id_within_treatment, -coder)
    
    # All coding slots are rated jointly here: a single column holding every
    # slot (unlike the pairwise combn() used for Delta I). The previous
    # `as.matrix(groups, nrows = 4)` produced the same one-column matrix only
    # because as.matrix() silently ignores the unknown `nrows` argument.
    groups <- unique(d_irr_t$coding_id)
    groups_comb <- matrix(groups, ncol = 1)
    # groups_comb <- combn(groups, 2, simplify = TRUE)
    
    tweet_id_unique <- unique(d_irr_t$tweet_id)
    # Sub-task columns = everything after tweet_id, coder, treatment
    variables <- names(d_irr)[-(1:3)]
    
    results <- list()
    
    # l indexes the next free position in `results`
    l <- 1
    # The seed is reset for every week, so each week uses the same stream
    set.seed(23)
    pb <- txtProgressBar(min = 0, max = N_BOOTSTRAP, style = 3)
    
    # Bootstrap over tweets: resample this week's tweets with replacement
    for (i in seq_len(N_BOOTSTRAP)){
      tweet_id_boot <- sample(tweet_id_unique, length(tweet_id_unique), replace = TRUE)
      
      # uid keeps repeated draws of the same tweet apart
      d_irr_t_bt <- tibble(tweet_id = tweet_id_boot) %>%
        mutate(uid = row_number(), .before = 1) %>%
        left_join(d_irr_t, by = "tweet_id", relationship = "many-to-many") %>%
        select(-tweet_id)
      
      for (j in seq_along(variables)){
        variable <- variables[j]
        
        # Wide matrix for this sub-task: one row per drawn tweet,
        # one column per coding slot
        mat <- d_irr_t_bt %>%
          select(uid, coding_id, !!(variable)) %>%
          rename("variable" = !!variable) %>%
          pivot_wider(id_cols = "uid", names_from = "coding_id", values_from = "variable") %>%
          select(-uid)
        
        for (k in seq_len(ncol(groups_comb))){
          # Fleiss' kappa across all slots in this column
          mat_s <- mat[, groups_comb[,k]]
          fleiss_k <- irr::kappam.fleiss(mat_s, detail = TRUE)
          fk <- fleiss_k$value
          
          results[[l]] <- tibble(
            bootstrap_id = i,
            `Sub-task` = variables[j],
            Coder1 = groups_comb[1,k],
            Coder2 = groups_comb[2,k],
            Coder3 = groups_comb[3,k],
            Coder4 = groups_comb[4,k],
            fk = fk
          )
          l <- l + 1
        }
      }
      setTxtProgressBar(pb, i)
    }
    
    # One row per (bootstrap draw, sub-task); cached per week for the
    # analysis chunk that follows
    results_df <- bind_rows(results)
    
    write_rds(results_df, sprintf("Data/processed/coder_results_by_week/fleiss_kappa_week%d.rds", week_current))
  }
}


```

```{r}

# Read the cached weekly bootstrap results and tag each with its week;
# `.before = 1` puts `week` first, matching the convention used elsewhere
sum_irr_weekly_ls <- list()
for (i in 1:5){
  sum_irr_weekly_ls[[i]] <- read_rds(sprintf("Data/processed/coder_results_by_week/fleiss_kappa_week%d.rds", i)) %>%
    mutate(week = i, .before = 1)
}

sum_irr_weekly_raw <- bind_rows(sum_irr_weekly_ls)

sum_irr_weekly_raw <- sum_irr_weekly_raw %>%
  mutate(`Sub-task` = recode(`Sub-task`, "Seriousness2" = "Seriousness"))

# Drop infinite kappas and values below -1; rows where `fk` is NA/NaN are
# dropped too because filter() removes rows whose condition is NA
sum_irr_weekly_raw <- sum_irr_weekly_raw %>% filter(abs(fk) != Inf & fk >= -1)

# Spelling fixed: "Policy - Healthcare" (was "Policy- Healthcare", which
# matches no sub-task name)
variables_select <- c("Seriousness", 
                      "State of Economy", "Inequality", 
                      "Policy - Healthcare", "Policy - Mask", "Policy - Economic Relief",
                      "Government Evaluation - Federal", "Government Evaluation - Trump",
                      "Government Evaluation - Governor")

# NOTE(review): `variables_select` is not applied here, so every sub-task in
# the cached files (including "Irrelevant" and "Not enough information") is
# pooled below. Confirm this is intended.
sum_irr_weekly <- sum_irr_weekly_raw


# Calculate the mean and the 2.5th / 97.5th percentiles of kappa by week,
# pooling all sub-tasks and bootstrap draws
sum_irr_weekly_95CI <- sum_irr_weekly %>%
  group_by(week) %>%
  summarise(mean = mean(fk),
            lo = quantile(fk, 0.025),
            hi = quantile(fk, 0.975)) %>%
  ungroup()

sum_irr_weekly_95CI %>%
  ggplot(aes(x = factor(week), y = mean, ymin = lo, ymax = hi)) +
  geom_pointrange(position = position_dodge(width = 0.2)) +
  geom_hline(yintercept = 0, linetype = "dashed") +
  labs(x = "Week", y = expression("Fleiss' " * kappa)) +
  theme_minimal()

# Saves the most recently created plot
ggsave("Figures_Tables/Figure_A-5.pdf", width = 6, height = 4)
```


### REPLICATE FIGURE A-6

```{r}
# Distribution of the best macro F1 per model, by label subset and labeling
# method. `sum_classifier_df_s` already holds the highest-F1 row of every
# model (same selection as grouping by model_name and keeping the top row).
p <- sum_classifier_df_s %>%
  group_by(subset, method) %>%
  summarise(
    lo = quantile(f1_macro, 0.025),
    m = quantile(f1_macro, 0.5),
    hi = quantile(f1_macro, 0.975)
  ) %>%
  # Display names; the leading space comes from the "Sub_" -> " " replacement
  mutate(subset = case_match(
    subset,
    "Full" ~ "All Labels",
    " Serious" ~ "Evaluation - Seriousness",
    " PolicyHealthcare" ~ "Policy - Healthcare",
    " Inequality" ~ "Policy - Inequality",
    " EvaluateTrump" ~ "Performance Evaluation - Trump",
    " EvaluateFed" ~ "Performance Evaluation - Federal Gov",
    .default = subset
  )) %>%
  # Levels are reversed so "All Labels" is on top after coord_flip()
  mutate(subset = factor(
    subset,
    levels = rev(c(
      "All Labels",
      "Evaluation - Seriousness",
      "Policy - Healthcare",
      "Policy - Inequality",
      "Performance Evaluation - Trump",
      "Performance Evaluation - Federal Gov"
    ))
  )) %>%
  ggplot(aes(x = subset, y = m, color = method)) +
  geom_errorbar(
    aes(ymin = lo, ymax = hi),
    width = 0.1,
    position = position_dodge(width = 0.5)
  ) +
  geom_point(position = position_dodge(width = 0.5)) +
  coord_flip() +
  theme_minimal() +
  labs(x = "Outcome", y = "Macro F1", color = "Labeling Methods") +
  theme(legend.position = "top")

ggsave("Figures_Tables/FIGURE_A-6.pdf", p, width = 6, height = 4)

p
```


### REPLICATE FIGURE A-7

```{r, fig.width=7, fig.height=5}
# For tweets that at least one text-only coder marked "Not enough
# information" (codebook id 76), count the labels assigned to them in the
# multimodal condition.
p <- d %>%
  select(tweet_id, `1`:`77`, treatment) %>%
  # Long format: one row per (label record, codebook id).
  # pivot_longer() replaces the superseded gather().
  pivot_longer(
    cols = -c(tweet_id, treatment),
    names_to = "codebook_id",
    values_to = "value"
  ) %>%
  mutate(codebook_id = as.numeric(codebook_id)) %>%
  left_join(d_codebook %>% select(codebook_id, label), by = "codebook_id") %>%
  # Keep only labels that were actually assigned ...
  filter(value == 1) %>%
  # ... and that belong to the sub-tasks of interest
  filter(grepl('Not enough|Serious|State of|Inequality|Healthcare|Masks|Economic Relief|Federal|Trump|Governor', label)) %>%
  # Flag tweets with an NEI label from the text-only condition
  group_by(tweet_id) %>%
  mutate(nei = ifelse(any(codebook_id == 76 & treatment == 0), 'nei', 'valid')) %>%
  ungroup() %>%
  # Keep the multimodal labels of the flagged tweets only
  filter(nei != 'valid',
         treatment != 0) %>%
  count(label) %>%
  ggplot(aes(x = n,
             y = reorder(label, n))) +
  # geom_col() is the direct equivalent of geom_bar(stat = 'identity')
  geom_col() +
  labs(x = 'Total number of labels',
       y = 'Labels assigned in multimodal condition',
       title = 'Multimodal labels assigned to "NEI" tweets',
       subtitle = '"Not enough information" tweets labeled in text-only condition')

ggsave("Figures_Tables/FIGURE_A-7.pdf", p, width = 7, height = 5)

p
```